In [186]:
from __future__ import print_function
import numpy as np
import pandas as pd
from collections import OrderedDict #sorting participant df dict before pd.concat()
import matplotlib.pylab as plt
from matplotlib import pylab
import matplotlib as mpl
%matplotlib inline
mpl.rcParams['figure.figsize'] = (14,8)
pd.options.display.mpl_style = 'default'
#pd.set_option('display.multi_sparse', True)
from pprint import pprint
from pprint import pformat
#pickled_dbase = "c:/db_pickles/pickle - dbase - 2014-07-28a.pickle"
#dbase = pd.read_pickle(pickled_dbase)
sms_tasknames = ['T1_SMS_5', 'T1_SMS_8',
'Ticks_ISO_T2_5', 'Ticks_ISO_T2_8',
'Ticks_Linear_5', 'Ticks_Linear_8',
'Ticks_Phase_5', 'Ticks_Phase_8',
'Jits_ISO_5', 'Jits_ISO_8',
'Jits_Linear_5', 'Jits_Linear_8',
'Jits_Phase_5', 'Jits_Phase_8', ]
# Participants that are excluded from all performance analysis
pilot_data = ['010', '011', '012', '013', '014',]
non_english_fluent = ['023', '031', '045', '050', '070', '106',]
left_handed = ['042', '088',]
pro_inst_skill = ['026', '037']
excluded_all_tasks = pilot_data + non_english_fluent + left_handed + pro_inst_skill
In [187]:
param_all_tasks = lambda v: {task: v for task in sms_tasknames}
sms_params_entry = {
'stimulus_timing': {
'T1_SMS_5': 'iso',
'Ticks_ISO_T2_5': 'iso',
'Ticks_Linear_5': 'linear',
'Ticks_Phase_5': 'phase',
'Jits_ISO_5': 'iso',
'Jits_Linear_5': 'linear',
'Jits_Phase_5': 'phase',
'T1_SMS_8': 'iso',
'Ticks_ISO_T2_8': 'iso',
'Ticks_Linear_8': 'linear',
'Ticks_Phase_8': 'phase',
'Jits_ISO_8': 'iso',
'Jits_Linear_8': 'linear',
'Jits_Phase_8': 'phase',
},
'stimulus_style': {
'T1_SMS_5': 'tick',
'Ticks_ISO_T2_5': 'tick',
'Ticks_Linear_5': 'tick',
'Ticks_Phase_5': 'tick',
'Jits_ISO_5': 'jitter',
'Jits_Linear_5': 'jitter',
'Jits_Phase_5': 'jitter',
'T1_SMS_8': 'tick',
'Ticks_ISO_T2_8': 'tick',
'Ticks_Linear_8': 'tick',
'Ticks_Phase_8': 'tick',
'Jits_ISO_8': 'jitter',
'Jits_Linear_8': 'jitter',
'Jits_Phase_8': 'jitter',
},
'ISI': {
'T1_SMS_5': 500,
'Ticks_ISO_T2_5': 500,
'Ticks_Linear_5': '(varies)',
'Ticks_Phase_5': 500,
'Jits_ISO_5': 500,
'Jits_Linear_5': '(varies)',
'Jits_Phase_5': 500,
'T1_SMS_8': 800,
'Ticks_ISO_T2_8': 800,
'Ticks_Linear_8': '(varies)',
'Ticks_Phase_8': 800,
'Jits_ISO_8': 800,
'Jits_Linear_8': '(varies)',
'Jits_Phase_8': 800,
},
#used in filtering step
'wait_beats_after_subj_start': param_all_tasks(12),
#used in assigning outlier status / "outlier metric"
'minimum_percent_deviation_to_keep': param_all_tasks(-35),
'maximum_percent_deviation_to_keep': param_all_tasks(+20),
}
#reshape to task>param so parameter lists can be selected by task
sms_params = {task: {param_type: taskparams[task]
for (param_type, taskparams)
in sms_params_entry.items()}
for task in sms_tasknames}
phase_shift_beats = {800: OrderedDict([
(30, -20),
(48, +20),
(64, +40),
(81, -40),
(97, -80),
(114, +80),
(131, +160),
(150, -160)]),
500: OrderedDict([
(64, +20),
(81, -20),
(97, -50),
(114, +50),
(131, +100),
(150, -100)])
}
In [188]:
short_name = {'T1_SMS_5': 'iso5t1',
'T1_SMS_8': 'iso8t1',
'Ticks_ISO_T2_5': 'iso5t2',
'Ticks_ISO_T2_8': 'iso8t2',
'Ticks_Linear_5': 'lin5t',
'Ticks_Linear_8': 'lin8t',
'Ticks_Phase_5': 'phase5t',
'Ticks_Phase_8': 'phase8t',
'Jits_ISO_5': 'iso5j',
'Jits_ISO_8': 'iso8j',
'Jits_Linear_5': 'lin5j',
'Jits_Linear_8': 'lin8j',
'Jits_Phase_5': 'phase5j',
'Jits_Phase_8': 'phase8j',
}
sms_shortnames = short_name.values()
long_name = {v: k for (k, v) in short_name.items()}
print(sms_shortnames)
In [189]:
def general_task_pid_iterator(label_tasks=True,
label_pids=True,
concise_labels=False,
skip_to_task=None,
skip_to_pid=None):
for t in sms_tasknames:
if skip_to_task:
if t != skip_to_task:
continue
else:
skip_to_task = None
if label_tasks:
if concise_labels:
print('\n' + t)
else:
print('='*80 + '\n' + t + '\n' + '='*80)
for pid in task_pids[t]:
if skip_to_pid:
if pid != skip_to_pid:
continue
else:
skip_to_pid = None
if label_pids:
if concise_labels:
print(pid, end=',')
else:
print('-'*60)
print('P. ' + pid)
yield (t, pid)
In [190]:
import cPickle as pickle
pfile = "c:/db_pickles/pickle - smsbeats - 2014-10-03b.pickle"
with open(pfile) as f:
task_frames = pickle.load(f)
task_pids = {}
for (k, v) in task_frames.items():
pids = sorted(set(v.index.get_level_values('pid')))
task_pids[k] = [p for p in pids if p not in excluded_all_tasks]
In [191]:
for t in task_frames.keys():
task_frames[t] = task_frames[t].drop(excluded_all_tasks, level='pid')
In [192]:
for k, v in task_pids.items():
print(k, '\t', len(v))
In [193]:
# This function was pulled out from the earlier processing step-- instead
# of deciding what's an outlier while doing the initial data processing,
# we can look at it here, and perhaps experiment with different settings.
# (don't just mindlessly maximize reliability, though-- there could certainly
# be reliable aspects of the data that we still want to remove-- e.g.,
# how many beats a P waits to start, how often they skip a tap...)
def filter_taps(df,
task_params,
print_results=False):
'''
Input: a DataFrame consisting of the unfiltered list of taps (without targets).
Output: the dataframe with outlying taps tagged and startup beats removed.
'''
# drop initial beats from task recording: [n] beats from start of task
# or [n] beats from the participant's first tap, whichever comes later
nonfail_beats = df[df.is_failure == False].index.tolist()
first_played_beat = min(nonfail_beats)
#beatdrop1 = task_params['wait_beats_after_task_start']
beatdrop2 = first_played_beat + task_params['wait_beats_after_subj_start']
beats_to_drop_from_start = beatdrop2 #max([beatdrop1, beatdrop2])
df = df[beats_to_drop_from_start:] #slice by index name (zero-indexed beats)
# temporarily remove outliers to form a distribution of typical
# values, which we'll use to form upper and lower limits for
# filtering in the following step.
# "from left" = the largest negative deviations,
# "from right" = the largest positive deviations
#nworst_left = task_params['stdev_calcs_exclude_n_from_left']
#nworst_right = task_params['stdev_calcs_exclude_n_from_right']
#df_adj = df[(df.dev_perc > max(df.dev_perc.nsmallest(nworst_left))) &
# (df.dev_perc < min(df.dev_perc.nlargest(nworst_right)))]
# Actual filtering of the values based on the temporary distribution
# created above. (This way, we retain the biggest deviations, as long
# as they aren't actually outliers.)
#rem_beyond_stds = task_params['filter_outliers_beyond_x_stdevs']
#trimmed_mean = df_adj.dev_perc.mean()
#trimmed_std = df_adj.dev_perc.std()
#upper_limit = trimmed_mean + (trimmed_std * rem_beyond_stds)
#lower_limit = trimmed_mean - (trimmed_std * rem_beyond_stds)
lower_limit = task_params['minimum_percent_deviation_to_keep'] # -35
upper_limit = task_params['maximum_percent_deviation_to_keep'] # +20
#df_filt = df[(df.dev_perc <= upper_limit) &
# (df.dev_perc >= lower_limit)]
df['is_outlier'] = False
df.is_outlier = ( (df.dev_perc > upper_limit)
| (df.dev_perc < lower_limit))
#devperc_failure = task_params['min_percentISI_deviation_counted_as_failure']
#return df_filt
return df
In [194]:
def label_shift_ranges(task_taps_df):
'''input: a taps-only df for a single task (all participants).
output: the same df with ranges labeled ('is_range_1a' etc.)
'''
#label slicing is end-inclusive, so don't overlap beat numbers
# one beat removed from the start of each "a" range (a player isn't
# expected to be in synch with the actual perturbed tap, but we start
# measuring when the next one comes.)
# (the '0' range is before all the large shifts. Just placeholders...)
# shift_ranges = {0: {'a': ( 0, 96),
# 'b': ( 0, 96),},
# 1: {'a': ( 98, 104),
# 'b': (105, 113),},
# 2: {'a': (115, 121),
# 'b': (122, 130),},
# 3: {'a': (132, 139),
# 'b': (140, 149),},
# 4: {'a': (151, 159),
# 'b': (160, 169),},
# }
#shifts: ... 97, 114, 131, 150
#get the next four beats
shift_ranges = {0: {'a': ( 0, 96),
'b': ( 0, 96),},
1: {'a': ( 99, 101),
'b': (102, 113),},
2: {'a': (116, 118),
'b': (119, 130),},
3: {'a': (133, 135),
'b': (136, 149),},
4: {'a': (152, 154),
'b': (155, 169),},
}
def getbeatxs(df):
# groupby/apply doesn't seem to be set up well for selecting
# particular rows from each value of a multiindex... here, we'll
# have to remove the 'pid' index explicitly I guess.
df = df.reset_index('pid').drop('pid', axis=1)
for i in [0,1,2,3,4]:
for j in ['a','b']:
srx = shift_ranges[i][j]
label = 'is_range_' + str(i) + j
df[label] = False
df.loc[srx[0]:srx[1], label] = True
label = 'is_shiftedarea'
df[label] = False
df.loc[shift_ranges[1]['a'][0]:shift_ranges[1]['b'][1], label] = True
df.loc[shift_ranges[2]['a'][0]:shift_ranges[2]['b'][1], label] = True
df.loc[shift_ranges[3]['a'][0]:shift_ranges[3]['b'][1], label] = True
df.loc[shift_ranges[4]['a'][0]:shift_ranges[4]['b'][1], label] = True
return df
g = task_taps_df.groupby(level='pid')
return g.apply(getbeatxs)
In [196]:
#taps only (remove "target" data)
db_taps = {t: df.xs('tap', level='stamp')
for (t, df) in task_frames.items()}
phase_tasks = ['Ticks_Phase_5', 'Ticks_Phase_8',
'Jits_Phase_5', 'Jits_Phase_8', ]
for t in phase_tasks:
db_taps[t] = label_shift_ranges(db_taps[t])
print(t + ": shift range labels")
taps_filtered = OrderedDict()
outlier_rem_record = {}
for t in sms_tasknames:
print('\n' + t)
tdata = db_taps[t]
tparams = sms_params[t]
outlier_rem_record[t] = {}
tdata_filt = {}
for pid in task_pids[t]:
print(pid, end=",")
pdata = tdata.xs(pid)
#print(max(pdata.index))
#filters out certain intervals and adds "is_outlier" field
filtered_a = filter_taps(pdata, tparams)
#remove beats based on "is_outlier_ field, as added by filter_taps()
#but don't remove outliers in phase tasks
if t in phase_tasks:
filtered_b = filtered_a
else:
filtered_b = filtered_a[filtered_a.is_outlier != True]
#Need to worry about how to filter the phase tasks later...
tdata_filt[pid] = filtered_b
outlier_rem_record[t][pid] = len(filtered_a) - len(filtered_b)
taps_filtered[t] = pd.concat(tdata_filt, names=['pid'])
mean_rem = round(np.mean(outlier_rem_record[t].values()),1)
std_rem = round(np.std(outlier_rem_record[t].values()),1)
max_rem = max(outlier_rem_record[t].values())
print('\n outlier beats removed per P.: mean={}, sd={}, max={}'
.format(mean_rem, std_rem, max_rem))
print('\n' + '=' * 70)
In [197]:
taps_filtered['Ticks_Phase_5'].tail().T
Out[197]:
In [198]:
def fig_dims(width, factor):
#WIDTH = 350.0 # the number latex spits out
#FACTOR = 0.45 # the fraction of the width you'd like the figure to occupy
fig_width_pt = width * factor
inches_per_pt = 1.0 / 72.27
golden_ratio = (np.sqrt(5) - 1.0) / 2.0 # because it looks good
fig_width_in = fig_width_pt * inches_per_pt # figure width in inches
fig_height_in = fig_width_in * golden_ratio # figure height in inches
fig_dims = [fig_width_in, fig_height_in] # fig dims as a list
return fig_dims
#don't adjust MPL defaults to pandas's preferred defaults
pd.options.display.mpl_style = None
mpl.rcdefaults()
from matplotlib import rcParams
#rcParams['axes.titlesize'] = 22
rcParams['font.size'] = 14
rcParams['xtick.labelsize'] = 12
rcParams['ytick.labelsize'] = 12
rcParams['legend.fontsize'] = 12
rcParams['font.family'] = 'serif'
rcParams['figure.facecolor'] = '1.0' # 0 black --> 1 white; grays
def task_hists(tdata):
figsize = fig_dims(2000, 0.45)
ax = avgtargs.plot(y = 'tinterval', linewidth=2, color='black', figsize=figsize)
avg_tap = avgtargs.tinterval + avgdevs.dev
upper_sd = avg_tap + SD_devs.dev
lower_sd = avg_tap - SD_devs.dev
#upper_sd.plot(y = 'dev', linewidth=3, color='black', linestyle="--")
#lower_sd.plot(y = 'dev', linewidth=3, color='black', linestyle="--")
#ax.plot(upper_sd.dev, linewidth=3, color='black', linestyle="--")
#ax.plot(lower_sd.dev, linewidth=3, color='black', linestyle="--")
avg_tap.plot(linewidth=1, color='black', linestyle="--", dashes=(5,3))
upper_sd.plot(linewidth=1, color='black', linestyle="-", marker="o", markersize=4)
lower_sd.plot(linewidth=1, color='black', linestyle="-", marker="o", markersize=4)
ax.set_ylabel("Milliseconds")
ax.set_xlabel("Interval number")
ax.grid(b=False, which='major', axis='both')
# set number of labeled "ticks" on each axis (overriding auto setting)
#ax.xaxis.set_major_locator(mpl.ticker.MaxNLocator(15))
#ax.yaxis.set_major_locator(mpl.ticker.MaxNLocator(10))
# (it will sometimes decide to show fewer than this, hence "max")
# Or to be precise:
ax.xaxis.set_major_locator(mpl.ticker.MaxNLocator(15))
#ax.xaxis.tick_bottom()
#ax.yaxis.tick_left()
#ax.spines["right"].set_color("none")
#ax.spines["top"].set_color("none")
ax.legend(["Target stimulus interval (TSI)",
"TSI + mean of absolute performance asynchronies",
u"Between-participants variability in absolute performance asynchronies (TSI ± 1 SD)"], loc="best")
ax.get_legend().set_title("")
ax.get_legend().draw_frame(False)
plt.show()
from matplotlib import rcParams
#rcParams['axes.titlesize'] = 22
rcParams['font.size'] = 14
rcParams['xtick.labelsize'] = 12
rcParams['ytick.labelsize'] = 12
rcParams['legend.fontsize'] = 12
rcParams['font.family'] = 'serif'
rcParams['figure.facecolor'] = '1.0' # 0 black --> 1 white; grays
# iso5t1 and iso8t1: Need to remove the extra intervals at the
# end of the task from the first few subs! (after beat 130-ish?)
# (Probably easiest and less confusing for future readers if they're just
# chopped out of the CSV file at the start.)
In [199]:
#don't adjust MPL defaults to pandas's preferred defaults
pd.options.display.mpl_style = None
mpl.rcdefaults()
from matplotlib import rcParams
#rcParams['axes.titlesize'] = 22
rcParams['font.size'] = 35
rcParams['xtick.labelsize'] = 16
rcParams['ytick.labelsize'] = 16
rcParams['legend.fontsize'] = 16
rcParams['font.family'] = 'serif'
rcParams['figure.facecolor'] = '1.0' # 0 black --> 1 white; grays
In [200]:
testdf = db_taps[long_name['phase5t']]
testdf[testdf.is_shiftedarea == True]
Out[200]:
In [201]:
tasks_using = sorted([k for k in db_taps.keys() if 'T1_SMS' not in k])
#print(tasks_using)
stack_dev_data = {t: db_taps[t] for t in tasks_using}
for t in tasks_using:
tdatadf = stack_dev_data[t]
if t in phase_tasks:
tdatadf = tdatadf[tdatadf.is_shiftedarea == True]
tdata = tdatadf.dev_perc
len_all = tdata.count()
num_pids = len(tdata.index.get_level_values('pid').unique())
print(short_name[t])
print("i = ", len_all)
print("N = ", num_pids)
plt.figure()
tdata.hist(figsize=(5,5), bins=60, color='white', grid=False, range=(-50,50))
#plt.show()
#plt.tight_layout()
plt.savefig("c:/_Sync/1020_histograms_raw_" + short_name[t] + '2.png',
format='png',
)
#n, bins, patches = plt.hist(db_taps[ 30, stacked=True, normed = True)
#plt.figure()
#plt.hist(stack_dev_data, stacked=True)
#plt.show()
In [203]:
#Phase tasks: portions NOT in a post-phase-shift period
phase_tasks = ['Ticks_Phase_5', 'Ticks_Phase_8', 'Jits_Phase_5', 'Jits_Phase_8']
phase_normal_sections = {}
phase_post_shift_sections = {}
phase_post_shift_all = {}
for t in phase_tasks:
# The only filtering done so far was to remove the
# first 12 beats after participant began tapping:
pdf = taps_filtered[t]
normal_period = pdf[ (pdf.is_range_0a==True)
| (pdf.is_range_1b==True)
| (pdf.is_range_2b==True)
| (pdf.is_range_3b==True)
| (pdf.is_range_4b==True)]
post_shift = pdf[ (pdf.is_range_1a==True)
| (pdf.is_range_2a==True)
| (pdf.is_range_3a==True)
| (pdf.is_range_4a==True)]
post_shift_all = pdf[pdf.is_shiftedarea==True]
phase_normal_sections[t] = normal_period
phase_post_shift_sections[t] = post_shift
phase_post_shift_all[t] = post_shift_all
In [204]:
responses_possible = 65. #intervals possible for each phase task (post-shift regions)
response_count = {}
for t in phase_tasks:
tdata = phase_post_shift_all[t]
for p in task_pids[t]:
responsecount = tdata.dev_perc.xs(p).count()
responsep = responsecount / responses_possible
response_p_dist.append(responsep)
if responsep < 0.9:
print(p, round(responsep,2))
In [207]:
measurement_region_overall_distributions = {'mean': {},
'sd': {},}
print("Before adjustment: \n\n")
for t, df in phase_post_shift_all.items():
print(t)
subs = df.groupby(level='pid').dev_perc
mean_of_means = subs.mean().mean()
mean_of_sds = subs.std().mean()
count = subs.mean().count()
measurement_region_overall_distributions['mean'][t] = mean_of_means
measurement_region_overall_distributions['sd'][t] = mean_of_sds
print('mean: %s' % mean_of_means)
print('sd: %s' % mean_of_sds)
print('n: %s' % count)
print("\n\nAfter adjustment: \n\n")
for t, df in phase_post_shift_all.items():
print(t)
dist_mean = measurement_region_overall_distributions['mean'][t]
dist_sd = measurement_region_overall_distributions['sd'][t]
subs = phase_post_shift_all[t].copy()
for p in task_pids[t]:
devp = subs.xs(p).dev_perc
upper_limit = 20
lower_limit = -35
devp[devp > upper_limit] = upper_limit
devp[devp < lower_limit] = lower_limit
#missing = len(devp) - devp.count()
#if missing > 18:
# print(p, missing)
#subs_replacena = subs.copy()
#subs_replacena[subs_replacena.dev_perc.isnull()] = dist_mean + (1 * dist_sd)
taps_filtered[t + "_mperiod_noreplace"] = subs
taps_filtered[t + "_mperiod_replaced"] = subs
task_pids[t + "_mperiod_noreplace"] = task_pids[t]
task_pids[t + "_mperiod_replaced"] = task_pids[t]
#recalc after adjustments
print(t)
devs = subs_replacena.groupby(level='pid').dev_perc
mean_of_means = devs.mean().mean()
mean_of_sds = devs.std().mean()
count = devs.mean().count()
print('mean: %s' % mean_of_means)
print('sd: %s' % mean_of_sds)
print('n: %s' % count)
In [208]:
post_shift_overall_distributions = {'mean': {},
'sd': {},}
print("Before adjustment: \n\n")
for t, df in phase_post_shift_sections.items():
print(t)
subs = df.groupby(level='pid').dev_perc
mean_of_means = subs.mean().mean()
mean_of_sds = subs.std().mean()
count = subs.mean().count()
post_shift_overall_distributions['mean'][t] = mean_of_means
post_shift_overall_distributions['sd'][t] = mean_of_sds
print('mean: %s' % mean_of_means)
print('sd: %s' % mean_of_sds)
print('n: %s' % count)
print("\n\nAfter adjustment: \n\n")
for t, df in phase_post_shift_sections.items():
dist_mean = post_shift_overall_distributions['mean'][t]
dist_sd = post_shift_overall_distributions['sd'][t]
subs = phase_post_shift_sections[t].copy()
upper_limit = dist_mean + (2 * dist_sd)
lower_limit = dist_mean - (2 * dist_sd)
subs[subs.dev_perc > upper_limit] = upper_limit
subs[subs.dev_perc < lower_limit] = lower_limit
subs_replacena = subs.copy()
subs_replacena[subs_replacena.dev_perc.isnull()] = dist_mean + (1 * dist_sd)
taps_filtered[t + "_postshift_noreplace"] = subs
taps_filtered[t + "_postshift_replaced"] = subs
task_pids[t + "_postshift_noreplace"] = task_pids[t]
task_pids[t + "_postshift_replaced"] = task_pids[t]
#recalc after adjustments
print(t)
devs = subs_replacena.groupby(level='pid').dev_perc
mean_of_means = devs.mean().mean()
mean_of_sds = devs.std().mean()
count = devs.mean().count()
print('mean: %s' % mean_of_means)
print('sd: %s' % mean_of_sds)
print('n: %s' % count)
In [209]:
#treat the sections not just after a phase shift like the other tasks...
#t = 'Ticks_Phase_8'
for t in phase_tasks:
alt_taskname = t + "_normal"
print(alt_taskname)
tparams = sms_params[t]
tdata = phase_normal_sections[t]
tdata_filt = {}
outlier_rem_record[alt_taskname] = {}
for pid in task_pids[t]:
pdata = tdata.xs(pid)
filtered_a = filter_taps(pdata, tparams)
filtered_b = filtered_a[filtered_a.is_outlier != True]
tdata_filt[pid] = filtered_b
outlier_rem_record[t][pid] = len(filtered_a) - len(filtered_b)
taps_filtered[alt_taskname] = pd.concat(tdata_filt, names=['pid'])
task_pids[alt_taskname] = task_pids[t]
mean_rem = round(np.mean(outlier_rem_record[t].values()),1)
std_rem = round(np.std(outlier_rem_record[t].values()),1)
max_rem = max(outlier_rem_record[t].values())
print('\n outlier beats removed per P.: mean={}, sd={}, max={}'
.format(mean_rem, std_rem, max_rem))
print('\n' + '=' * 70)
In [210]:
taps_filtered.keys()
Out[210]:
In [211]:
def sideplots(title, serieslist, namelist, **kwargs):
from matplotlib import pyplot as plt
assert len(serieslist) == len(namelist)
count = len(serieslist)
fig, axes = plt.subplots(nrows=count, ncols=3, **kwargs)
colors=['blue', 'green', 'red', 'white', 'orange']
plots = [(namelist[i], serieslist[i]) for i in range(count)]
for (i, (n, s)) in enumerate(plots):
try:
plot_color = colors[i]
except IndexError:
plot_color = 'blue'
ax_hist = plt.subplot2grid((count, 3), (i, 0), colspan=2)
ax_hist.set_title(n, fontsize=12)
ax_line = plt.subplot2grid((count, 3), (i, 2), colspan=1)
ax_line.set_title(n, fontsize=12)
s.plot(ax=ax_line, linewidth=3, color=plot_color)
s.hist(ax=ax_hist, bins=20, color=plot_color)
fig.suptitle(title, fontsize=22)
plt.show()
#fig.tight_layout()
In [212]:
short_name
Out[212]:
In [174]:
t = long_name['phase5t']
for pid in task_pids[t]:
tdf = phase_post_shift_sections[t].xs(pid) #(unfiltered)
first_non_miss = tdf[tdf.is_failure==False]
first_beat_tapped = min(first_non_miss.index)
n = 12
after_first_n = tdf.ix[first_beat_tapped + n:]
missed_beats_count = len(after_first_n[after_first_n.is_failure==True])
sdf = after_first_n.dev_perc
full_count = len(after_first_n)
filt_df = sdf[(sdf >= -35) & (sdf <= 20)]
filt_count = len(sdf) - len(filt_df)
good_count = len(filt_df)
pct_retained = 100 * good_count / full_count
print(str(pct_retained) + '%')
if pct_retained < 75: print("\n\n\n!!!!\n")
sideplots(title = "P. {} - misses: {} - filtered out: {} - OK: {}".format(pid,
missed_beats_count,
filt_count,
good_count),
serieslist=[sdf, filt_df],
namelist=['raw', 'filtered'],
figsize=(19,6))
#if pid == '020': break
In [20]:
#iso5t2: -30 to 30 looks good
#iso8t2:
#p. 49 mostly halfway between beats sometimes...
#p. 55 consistently tapping halfway between beats
#lin5t:
#p. 089 switches to half-beat tapping for the last portion of the trial
#switching to +/- 25%
#lin8t:
#p. 073 used half-taps
#iso5j (jitters):
#switching to -35 to +20
#pid='015'
In [151]:
t = long_name['iso8j']
for pid in task_pids[t]:
tdf = db_taps[t].xs(pid) #(unfiltered)
first_non_miss = tdf[tdf.is_failure==False]
first_beat_tapped = min(first_non_miss.index)
n = 12
after_first_n = tdf.ix[first_beat_tapped + n:]
missed_beats_count = len(after_first_n[after_first_n.is_failure==True])
sdf = after_first_n.dev_perc
full_count = len(after_first_n)
#filt_df = sdf[(sdf >= -35) & (sdf <= 20)]
filt_df = taps_filtered[t].xs(pid).dev_perc
filt_count = len(sdf) - len(filt_df)
good_count = len(filt_df)
pct_retained = 100 * good_count / full_count
print(str(pct_retained) + '%')
if pct_retained < 75: print("\n\n\n!!!!\n")
sideplots(title = "P. {} - misses: {} - filtered out: {} - OK: {}".format(pid,
missed_beats_count,
filt_count,
good_count),
serieslist=[sdf, filt_df],
namelist=['raw', 'filtered'],
figsize=(19,6))
#if pid == '020': break
In [214]:
insufficient_data_pids = {}
minimum_prop = 0.70 # must have at least 70% of the the data non-missed
# and in the keepable range (-35 to 20 percent deviation)
data_proportion = {}
# Three participants (49, 55, 73) have already been removed due to an earlier
# interation of this analysis that showed that they each had fewer than half
# of the tasks meeting the 70% qualifying data criterion.
for t in sms_tasknames:
print(t)
devs = taps_filtered[t].groupby(level='pid').dev_perc
task_beats_max = devs.count().max()
print('Max beats: %s' % task_beats_max)
proportion = devs.count() / task_beats_max
below_cutoff = proportion[proportion < minimum_prop]
print(below_cutoff)
In [218]:
short_name
Out[218]:
In [219]:
#From arduino apparatus code:
#define LINEAR_800_STARTING_ISI 820 //
#define LINEAR_800_PCHANGE_EVERY 5 // decrease by 10 ms every five intervals (avg. -2ms per interval)
#define LINEAR_800_PCHANGE_AMOUNT -10
#define LINEAR_500_PCHANGE_EVERY 5 // increase by 10 ms every five intervals (avg. +2ms per interval)
#define LINEAR_500_PCHANGE_AMOUNT 10
#define LINEAR_500_STARTING_ISI 480 //
#tasks go from beat 0 to beat 169: start at 820, end at [820 - (165 * 10 / 5)] = 490
# start at 480, end at [480 + (165 * 10 / 5)] = 810
#Splitting into thirds (by interval count, not time duration!):
linear_tasks = ['Ticks_Linear_5', 'Ticks_Linear_8',
'Jits_Linear_5', 'Jits_Linear_8']
linear_part_taps = OrderedDict()
linear_part_dfs = OrderedDict()
for t in linear_tasks:
subtask_name_A = t + "ptA"
subtask_name_B = t + "ptB"
subtask_name_C = t + "ptC"
task_pids[subtask_name_A] = task_pids[t]
task_pids[subtask_name_B] = task_pids[t]
task_pids[subtask_name_C] = task_pids[t]
linear_part_taps[subtask_name_A] = {}
linear_part_taps[subtask_name_B] = {}
linear_part_taps[subtask_name_C] = {}
for p in task_pids[t]:
#ex_task_5 = db_taps[long_name['lin5t']].xs(p)
#ex_task_8 = db_taps[long_name['lin8t']].xs(p)
#first_part_800 = ex_task_8[10:65] # (10 to 64) --> 800ms to 700ms
#second_part_800 = ex_task_8[65:110] # (65 to 109) --> 690ms to 610ms
#third_part_800 = ex_task_8[110:165] # (110 to 164) --> 600ms to 500ms
#first_part_500 = ex_task_5[10:65] # (10 to 64) --> 500ms to 600ms
#second_part_500 = ex_task_5[65:110] # (65 to 109) --> 610ms to 690ms
#third_part_500 = ex_task_5[110:165] # (110 to 164) --> 590ms to 500ms
#but it comes out the same for all tasks:
taps = taps_filtered[t].xs(p)
first_part = taps[10:65]
second_part = taps[65:110]
third_part = taps[110:165]
linear_part_taps[subtask_name_A][p] = first_part
linear_part_taps[subtask_name_B][p] = second_part
linear_part_taps[subtask_name_C][p] = third_part
linear_part_names = linear_part_taps.keys()
# If we want to add this to the DFO, we'll need to translate the dict to a pd.concat set of pids.
#for (far, df) in linear_part_taps.items():
# taps_filtered[var] = df
#linear_part_taps[t + "ptB"]['110'].dev_perc.plot(figsize=(5,2))
#plt.show()
#---------------------
#sub_tasks = linear_part_taps.keys()
reshaped = {'Tick_Lin_500600': OrderedDict(),
'Tick_Lin_610690': OrderedDict(),
'Tick_Lin_700800': OrderedDict(),
'Jit_Lin_500600': OrderedDict(),
'Jit_Lin_610690': OrderedDict(),
'Jit_Lin_700800': OrderedDict(),
}
lin_tick_both = sorted(set(task_pids['Ticks_Linear_5'])
.intersection(task_pids['Ticks_Linear_8']))
lin_jits_both = sorted(set(task_pids['Jits_Linear_5'])
.intersection(task_pids['Jits_Linear_8']))
for p in lin_tick_both:
t500600 = pd.concat(axis=0, objs = [linear_part_taps['Ticks_Linear_5ptA'][p],
linear_part_taps['Ticks_Linear_8ptC'][p]])
t610690 = pd.concat(axis=0, objs = [linear_part_taps['Ticks_Linear_5ptB'][p],
linear_part_taps['Ticks_Linear_8ptB'][p]])
t700800 = pd.concat(axis=0, objs = [linear_part_taps['Ticks_Linear_5ptC'][p],
linear_part_taps['Ticks_Linear_8ptA'][p]])
reshaped['Tick_Lin_500600'][p] = t500600
reshaped['Tick_Lin_610690'][p] = t610690
reshaped['Tick_Lin_700800'][p] = t700800
for p in lin_jits_both:
j500600 = pd.concat(axis=0, objs = [linear_part_taps['Jits_Linear_5ptA'][p],
linear_part_taps['Jits_Linear_8ptC'][p]])
j610690 = pd.concat(axis=0, objs = [linear_part_taps['Jits_Linear_5ptB'][p],
linear_part_taps['Jits_Linear_8ptB'][p]])
j700800 = pd.concat(axis=0, objs = [linear_part_taps['Jits_Linear_5ptC'][p],
linear_part_taps['Jits_Linear_8ptA'][p]])
reshaped['Jit_Lin_500600'][p] = j500600
reshaped['Jit_Lin_610690'][p] = j610690
reshaped['Jit_Lin_700800'][p] = j700800
for t in ['Tick_Lin_500600', 'Tick_Lin_610690', 'Tick_Lin_700800']:
task_pids[t] = lin_tick_both
for t in ['Jit_Lin_500600', 'Jit_Lin_610690', 'Jit_Lin_700800']:
task_pids[t] = lin_jits_both
reshaped_dfs = OrderedDict()
for (taskname, task_p_dict) in reshaped.items():
print(taskname)
pids_df = pd.concat(axis=0, objs=task_p_dict.values(), keys=task_p_dict.keys(), names=['pid'])
reshaped_dfs[taskname] = pids_df
for (var, df) in reshaped_dfs.items():
taps_filtered[var] = df
#####
reshaped_dfs['Tick_Lin_700800'][::1000]
Out[219]:
In [68]:
taps_filtered['Tick_Lin_610690']
Out[68]:
In [222]:
#taps_filtered.keys()
short_name['Jits_Phase_8_postshift_noreplace'] = 'phase8j_psk' #post shift, kept nans
short_name['Jits_Phase_8_postshift_replaced'] = 'phase8j_psr' #post shift, replaced nans
short_name['Ticks_Phase_8_postshift_noreplace'] = 'phase8tp_psk'
short_name['Ticks_Phase_8_postshift_replaced'] = 'phase8t_psr'
short_name['Jits_Phase_5_postshift_noreplace'] = 'phase5j_psk'
short_name['Jits_Phase_5_postshift_replaced'] = 'phase5j_psr'
short_name['Ticks_Phase_5_postshift_noreplace'] = 'phase5t_psk'
short_name['Ticks_Phase_5_postshift_replaced'] = 'phase5t_psr'
short_name['Jits_Phase_8_mperiod_noreplace'] = 'phase8j_mpk' #post shift, kept nans
short_name['Jits_Phase_8_mperiod_replaced'] = 'phase8j_mpr' #post shift, replaced nans
short_name['Ticks_Phase_8_mperiod_noreplace'] = 'phase8tp_mpk'
short_name['Ticks_Phase_8_mperiod_replaced'] = 'phase8t_mpr'
short_name['Jits_Phase_5_mperiod_noreplace'] = 'phase5j_mpk'
short_name['Jits_Phase_5_mperiod_replaced'] = 'phase5j_mpr'
short_name['Ticks_Phase_5_mperiod_noreplace'] = 'phase5t_mpk'
short_name['Ticks_Phase_5_mperiod_replaced'] = 'phase5t_mpr'
short_name['Ticks_Phase_5_normal'] = 'phase5t_nrm'
short_name['Ticks_Phase_8_normal'] = 'phase8t_nrm'
short_name['Jits_Phase_5_normal'] = 'phase5j_nrm'
short_name['Jits_Phase_8_normal'] = 'phase8j_nrm'
short_name['Tick_Lin_500600'] = 'lint_500600'
short_name['Tick_Lin_610690'] = 'lint_610690'
short_name['Tick_Lin_700800'] = 'lint_700800'
short_name['Jit_Lin_500600'] = 'linj_500600'
short_name['Jit_Lin_610690'] = 'linj_610690'
short_name['Jit_Lin_700800'] = 'linj_700800'
In [223]:
tasknames = task_frames.keys()
# create a dataframe with an index for every PID that is represented
# in at least one of the tasks' data
pids_all_sms = set()
for (t, df) in task_frames.items():
pids = df.index.get_level_values('pid').unique()
pids_not_excl = [p for p in pids if p not in excluded_all_tasks]
pids_all_sms = pids_all_sms.union(pids_not_excl)
dfo = pd.DataFrame(index = sorted(pids_all_sms))
dfo.index.name = 'pid'
# Iterate through all combinations of tasks, measures, statistics,
# participants. (the valid PIDs are different between tasks, so
# this selects the correct pids. But it might be more elegant to use an
# all-possible-combinations function across all four lists, and handle
# an exception when a PID isn't represented in a given task's data.)
def stat_combo_gen(tasks, measures, statfuncs, statkwargs):
kwargs_all = {k: {} for k in statfuncs} #default, no kwargs
for k, v in statkwargs.items():
kwargs_all[k] = v
for task in tasks:
for measure in measures:
for statfunc in statfuncs:
for p in task_pids[task]:
yield (task, measure, statfunc, kwargs_all, p)
filt_tasknames = taps_filtered.keys()
#stat_combos = stat_combo_gen(tasks = tasknames, # ['T1_SMS_8'],
stat_combos = stat_combo_gen(tasks = filt_tasknames, # ['T1_SMS_8'],
measures = ['dev_perc', 'dev', 'ints'],
statfuncs = ['mean', 'std', 'count'],
statkwargs = {'std': {'ddof': 1}}
)
for (task, measure, statfunc, kwargs, p) in stat_combos:
#print((task, measure, statfunc, p))
ptaps = taps_filtered[task].xs(p, level='pid')
mseries = ptaps[measure]
statfunction = getattr(mseries, statfunc)
result = statfunction(**kwargs[statfunc])
output_varname = '_'.join([short_name[task], measure, statfunc])
output_varname = output_varname.replace("dev_perc_mean", "DPm")
output_varname = output_varname.replace("dev_perc_std", "DPsd")
output_varname = output_varname.replace("dev_perc_count", "DPct")
if output_varname not in dfo:
print(output_varname, end=", ")
dfo[output_varname] = np.nan
dfo[output_varname].loc[p] = result
In [224]:
dfo.T
# We need a good way of dealing with missed beats in the post-shift periods.
# We want missed beats to "count against" the participant there.
# So: we'll use an outlier criterion for truncating (not removing) here, and
# but we'll apply it identically across participants (rather than relative to
# individual participants' overall variability). When a tap is missed, we
# apply the maximum (truncation) value, so that a missed tap is as bad as the
# worst inaccuracy that's kept on record for participants.
Out[224]:
In [225]:
dfo_output_updated = '2014-10-20a'
prefix = "c:/db_pickles/pickle - dfo-sms - "
import cPickle as pickle
output_file= prefix + dfo_output_updated + '.pickle'
pickle.dump(dfo, open(output_file, "wb"))
# Proceed with pickle to Part 5
dfo.head(8).T
Out[225]:
In [239]:
#st = 'Ticks_Linear_5ptB'
for st in ['Ticks_Linear_5ptA',
'Ticks_Linear_5ptB',
'Ticks_Linear_5ptC', ]#linear_part_names:
print('\n\n %s \n===================\n' % st)
task_taps = linear_part_taps[st]
for p in task_pids[st]:
print(p)
taps = task_taps[p].dev_perc
print('\t unfiltered')
taps.hist(figsize=(5,1.2), color='blue')
plt.show() #remove this to plot both in the same figure
print('\t filtered')
filt = taps[(taps <= 20) & (taps >= -35)]
try:
filt.hist(figsize=(5,1.2), color='green')
plt.show()
except ValueError:
print("ZERO SIZE, CANNOT PLOT")
#if p=="025": break
In [113]:
#NO LONGER USING THIS-- SECTIONS ARE COLLAPSED TOGETHER ABOVE
# First pass: determine the truncation value for each task. What's
# z = 2.97 for all taps across all participants within a given
# section of a given task?
phase_tasks = ['Ticks_Phase_5', 'Ticks_Phase_8',
'Jits_Phase_5', 'Jits_Phase_8', ]
psection_list = ['0a', #0b isn't a separate section
'1a', '1b', '2a', '2b',
'3a', '3b', '4a', '4b']
trunc_value = {t: {} for t in phase_tasks}
reindexed_sections = {t: {} for t in phase_tasks}
reindexed_stacked = {}
for t in phase_tasks:
tdata = taps_filtered[t]
#not splitting by participant-- calc across all p's
for section in psection_list:
#print(section)
section_ident_column = 'is_range_' + section
section_taps = tdata[tdata[section_ident_column]==True]
#print(section_taps.head())
section_dps = section_taps.dev_perc
#print(section_dps.count())
#print(section_dps.std())
section_mean = section_dps.mean()
section_sd = section_dps.std()
trunc = {'upper': section_mean + 2.97 * section_sd,
'lower': section_mean - 2.97 * section_sd}
trunc_value[t][section] = trunc
print("{}, {}: <{}, >{}".format(t, section,
trunc['lower'], trunc['upper']))
#fill in all beat values (they were skipped when we built
# this dataframe, so there aren't any NaN values in place yet)
sec_start = section_dps.index.get_level_values('beat').min()
sec_end = section_dps.index.get_level_values('beat').max()
dps = {pid: section_dps.xs(pid) for pid in task_pids[t]}
p_rows = pd.DataFrame(dps)
p_cols = p_rows.T
dps_reindexed = p_cols.reindex(columns=range(sec_start,sec_end+1))
dps_reindexed.index.name = 'pid'
reindexed_sections[t][section] = dps_reindexed
#Not actually doing this yet: just looking at what happens to the values
# in each task based on the trunc values we just found.
stacked = dps_reindexed.stack(dropna=False)
print('trunc <: %s' % stacked[stacked < trunc['lower']].count())
print('trunc >: %s' % stacked[stacked > trunc['upper']].count())
print('blank: %s' % len(stacked[stacked.isnull()])) #count() excludes NaNs!
#print(section.isnull().count())
In [ ]:
phase_tasks = ['Ticks_Phase_5', 'Ticks_Phase_8',
'Jits_Phase_5', 'Jits_Phase_8', ]
psection_list = ['0a', #0b isn't a separate section
'1a', '1b', '2a', '2b',
'3a', '3b', '4a', '4b']
for task in phase_tasks:
print(task)
task_sections = reindexed_sections[task]
for section in psection_list:
print(section)
truncs = trunc_value[task][section]
for pid in task_pids[task]:
print(pid, end=':')
ptaps = task_sections[section].loc[pid]
# only measure we're using is dev_perc now, since that's
#Truncate outliers and impute missing beats with truncation value
# (randomly select upper or lower truncation value, I guess?)
def trunc_and_replace_nans(i):
#def pick_trunc_side():
# import random
# r = random.randint(0, 1)
# if r==1:
# return truncs['upper']
# else:
# return truncs['lower']
if i > truncs['upper']:
i = truncs['upper']
elif i < truncs['lower']:
i = truncs['lower']
elif np.isnan(i):
i = truncs[prev_trunc]
#avoid randomization.... was pick_trunc_side()
return i
stats_before = (ptaps.max(), ptaps.min(), ptaps.mean())
prev_trunc = 'upper' #default
ptaps_post = ptaps.apply(trunc_and_replace_nans)
stats_after = (ptaps_post.max(), ptaps_post.min(), ptaps_post.mean())
if stats_before==stats_after:
print("no change: {:f} {:f} {:f}"
.format(stats_before[0], stats_before[1], stats_before[2]))
else:
print("changed. before: {:f} {:f} {:f}, after: {:f} {:f} {:f}"
.format(stats_before[0], stats_before[1], stats_before[2],
stats_after[0], stats_after[1], stats_after[2]))
result_m = ptaps_post.mean()
result_sd = ptaps_post.std()
result_ct_pre = ptaps.count() #AFTER replacements!
result_ct_post = ptaps_post.count() #AFTER replacements!
task_sec_name = '_'.join([short_name[task], 's' + section])
output_varname_m = task_sec_name + '_DPm'
output_varname_sd = task_sec_name + '_DPsd'
output_varname_ct_pre = task_sec_name + '_DPctPre'
output_varname_ct_post = task_sec_name + '_DPctPost'
if output_varname_m not in dfo:
dfo[output_varname_m] = np.nan
print('added varname: %s' % output_varname_m)
if output_varname_sd not in dfo:
dfo[output_varname_sd] = np.nan
print('added varname: %s' % output_varname_sd)
if output_varname_ct_pre not in dfo:
dfo[output_varname_ct_pre] = np.nan
print('added varname: %s' % output_varname_ct_pre)
if output_varname_ct_post not in dfo:
dfo[output_varname_ct_post] = np.nan
print('added varname: %s' % output_varname_ct_post)
dfo[output_varname_m].loc[pid] = result_m
dfo[output_varname_sd].loc[pid] = result_sd
dfo[output_varname_ct_pre].loc[pid] = output_varname_ct_pre
dfo[output_varname_ct_post].loc[pid] = output_varname_ct_post
print()
print()
print()
#change output: max, min, mean
In [116]:
dfo.T
Out[116]:
In [227]:
#sms_tasknames = ['Ticks_Linear_5',
# 'Ticks_Linear_8',
# 'Ticks_Phase_5',
# 'Ticks_Phase_8',
# 'T1_SMS_5',
# 'Ticks_ISO_T2_5',
# 'T1_SMS_8',
# 'Ticks_ISO_T2_8',]
def cool_sms_plot(participant_task_df, ISI):
from pandas.stats.moments import rolling_mean
lb_test = participant_task_df.xs('target', level='stamp')
tap_test = participant_task_df.xs('tap', level='stamp')
lb_test.plot(x = 'task_ms', y = 'tinterval', figsize=(13,7), linewidth=4)
tap_test['dev_relative'] = tap_test.dev + ISI
tap_test.plot(x = 'task_ms', y = 'dev_relative', marker="o", linewidth=0) #, figsize=(14,9))
tap_test['rmean'] = rolling_mean(tap_test.dev_relative, window=10, center=True, min_periods=2)
tap_test.plot(x = 'task_ms', y = 'rmean', linewidth=2)
#print(tap_test.dev)
#need to do this in MPL directly so I can return a plot variable
#cool_sms_plot(sms_tasknames[0], '102')
#cool_sms_plot('Ticks_ISO_T2_5', '103')
#could use this in manuscript to demonstrate pre- and post-filtered data...
task = long_name['phase8j']
pid = '116'
print(pid)
cool_sms_plot(task_frames[task].xs(pid), 800)
In [229]:
def fig_dims(width, factor):
#WIDTH = 350.0 # the number latex spits out
#FACTOR = 0.45 # the fraction of the width you'd like the figure to occupy
fig_width_pt = width * factor
inches_per_pt = 1.0 / 72.27
golden_ratio = (np.sqrt(5) - 1.0) / 2.0 # because it looks good
fig_width_in = fig_width_pt * inches_per_pt # figure width in inches
fig_height_in = fig_width_in * golden_ratio # figure height in inches
fig_dims = [fig_width_in, fig_height_in] # fig dims as a list
return fig_dims
def task_variability(taskname):
#tdata = db_taps[taskname]
tdata = taps_filtered[taskname]
#plt.suptitle(short_name[taskname])
ISI = sms_params[taskname]['ISI']
if ISI == '(varies)': ISI = 650
avgdevs = tdata.mean(level='beat')[5:]
SD_devs = tdata.std(level='beat')[5:]
avgtargs = task_frames[taskname].xs('target',level='stamp').mean(level='beat')
figsize = fig_dims(2000, 0.45)
ax = avgtargs.plot(y = 'tinterval', linewidth=2, color='black', figsize=figsize)
#ax.plot(avgdevs)
adjust_avgdevs = avgdevs + ISI
#adjust_avgdevs.plot(y = 'dev', linewidth=2)
#upper_sd = adjust_avgdevs + (SD_devs)
#lower_sd = adjust_avgdevs - (SD_devs)
#plt.fill_between(x='task_ms', y1=upper_sd.dev, y2=lower_sd.dev, color='grey', alpha='0.5')
#upper_sd.plot(y = 'dev', linewidth=1)
#lower_sd.plot(y = 'dev', linewidth=1)
#upper_sd = ISI + SD_devs
#lower_sd = ISI - SD_devs
avg_tap = avgtargs.tinterval + avgdevs.dev
upper_sd = avg_tap + SD_devs.dev
lower_sd = avg_tap - SD_devs.dev
#upper_sd.plot(y = 'dev', linewidth=3, color='black', linestyle="--")
#lower_sd.plot(y = 'dev', linewidth=3, color='black', linestyle="--")
#ax.plot(upper_sd.dev, linewidth=3, color='black', linestyle="--")
#ax.plot(lower_sd.dev, linewidth=3, color='black', linestyle="--")
avg_tap.plot(linewidth=1, color='black', linestyle="--", dashes=(5,3))
upper_sd.plot(linewidth=1, color='black', linestyle="-", marker="o", markersize=4)
lower_sd.plot(linewidth=1, color='black', linestyle="-", marker="o", markersize=4)
ax.set_ylabel("Milliseconds")
ax.set_xlabel("Interval number")
ax.grid(b=False, which='major', axis='both')
# set number of labeled "ticks" on each axis (overriding auto setting)
ax.xaxis.set_major_locator(mpl.ticker.MaxNLocator(15))
ax.yaxis.set_major_locator(mpl.ticker.MaxNLocator(10))
# (it will sometimes decide to show fewer than this, hence "max")
# Or to be precise:
ax.xaxis.set_major_locator(mpl.ticker.MaxNLocator(15))
#ax.xaxis.tick_bottom()
#ax.yaxis.tick_left()
#ax.spines["right"].set_color("none")
#ax.spines["top"].set_color("none")
ax.legend(["Target IOI",
"IOI + mean of absolute asynchrony values",
u"Between-participants variability in mean asynchrony (IOI ± 1 SD)"], loc="best")
ax.get_legend().set_title("")
ax.get_legend().draw_frame(False)
plt.savefig("c:/_Sync/1020a_postfilt_varlines_" + short_name[t] + '.png',
format='png',
)
plt.show()
#don't adjust MPL defaults to pandas's preferred defaults
pd.options.display.mpl_style = None
mpl.rcdefaults()
from matplotlib import rcParams
#rcParams['axes.titlesize'] = 22
rcParams['font.size'] = 14
rcParams['xtick.labelsize'] = 12
rcParams['ytick.labelsize'] = 12
rcParams['legend.fontsize'] = 12
rcParams['font.family'] = 'serif'
rcParams['figure.facecolor'] = '1.0' # 0 black --> 1 white; grays
for t in sms_tasknames:
print(short_name[t])
print("N = ", len(db_taps[t].index.get_level_values('pid').unique()))
task_variability(t) #long_name['iso5t1'])
#break
# iso5t1 and iso8t1: Need to remove the extra intervals at the
# end of the task from the first few subs! (after beat 130-ish?)
# (Probably easiest and less confusing for future readers if they're just
# chopped out of the CSV file at the start.)
In [31]:
mpl.rcdefaults()
rcParams #.keys()
#avgdevs = db_taps[long_name['phase8t']].mean(level='beat').dev
Out[31]:
In [102]:
def plot_settings():
#don't adjust MPL defaults to pandas's preferred defaults
pd.options.display.mpl_style = None
mpl.rcdefaults()
from matplotlib import rcParams
#rcParams['axes.titlesize'] = 22
rcParams['font.size'] = 14
rcParams['xtick.labelsize'] = 12
rcParams['ytick.labelsize'] = 12
rcParams['legend.fontsize'] = 12
rcParams['font.family'] = 'serif'
rcParams['figure.facecolor'] = '1.0' # 0 black --> 1 white; grays
plot_settings()
phase5t = db_taps[long_name['phase5t']]
phase8t = db_taps[long_name['phase5t']]
avgdevs = {5: phase5t.mean(level='beat')[5:],
8: phase8t.mean(level='beat')[5:], }
sd_devs = {5: phase5t.std(level='beat')[5:],
8: phase8t.std(level='beat')[5:], }
sd_devs[5].dev_perc.plot(color='red')
for shift_beat in [97, 114, 131, 150]:
plt.axvline(x=shift_beat, color='black', ymin=0, ymax=1.0, linewidth=1)
In [24]:
import re
def col_find(df, regex):
cols = list(enumerate(df.columns))
matches = [#'%d. %s' %
(i, c)
for (i, c) in cols
#if filt in c
if re.findall(regex, c)
]
#print('\n'.join(matches))
return matches
#filt = r"(^J)(.*)(d$)"
#cf = col_find(dfo, filt)
#import itertools
#list(itertools.combinations(cf, 2))
In [25]:
def inverse_scatter(dfo, ilocx, ilocy, *args, **kwargs):
inversed = lambda df: 1.0/df
df_temp = pd.concat([inversed(dfo.T.iloc[ilocx]),
inversed(dfo.T.iloc[ilocy])],
axis=1)
df_temp.plot(x=0,y=1, kind='scatter', **kwargs)
plt.show()
print('r = %f' % df_temp.corr().iloc[0,1])
def inverse_correl(dfo, ilocx, ilocy, **kwargs):
inversed = lambda df: 1.0/df
df_temp = pd.concat([inversed(dfo.T.iloc[ilocx]),
inversed(dfo.T.iloc[ilocy])],
axis=1)
print('r = %f' % df_temp.corr().iloc[0,1])
inverse_scatter(dfo, 73, 79, figsize=(5,5))
In [26]:
#NEW DATA VERSION....
percstds = col_find(dfo, r'.*perc_std')
import itertools
pairs = list(itertools.combinations(percstds, 2))
pair_nums = [(x[0], y[0]) for x, y in pairs]
for (x, y) in pair_nums:
inverse_scatter(dfo, x, y, figsize=(5, 5))
In [29]:
# original scale plots for comparison
def scatter_by_colnum(dfo, ilocx, ilocy, *args, **kwargs):
#inversed = lambda df: 1.0/df
df_temp = pd.concat([dfo.T.iloc[ilocx],
dfo.T.iloc[ilocy]],
axis=1)
df_temp.plot(x=0,y=1, kind='scatter', **kwargs)
plt.show()
print('r = %f' % df_temp.corr().iloc[0,1])
import itertools
pairs = list(itertools.combinations(percstds, 2))
pair_nums = [(x[0], y[0]) for x, y in pairs]
for (x, y) in pair_nums:
scatter_by_colnum(dfo, x, y, figsize=(5, 5))
In [26]:
def sideplots(series_top, series_bottom,
plotname_top="pre-filter",
plotname_bottom="post-filter"):
from matplotlib import pyplot as plt
fig, axes = plt.subplots(nrows=2, ncols=3)
fig.set_figheight(7)
fig.set_figwidth(15)
#fig.suptitle('t', fontsize=25)
#plt.xlabel('xlabel', fontsize=18)
#plt.ylabel('ylabel', fontsize=16)
ax1 = plt.subplot2grid((2,3), (0,0), colspan=2)
ax2 = plt.subplot2grid((2,3), (1,0), colspan=2)
ax3 = plt.subplot2grid((2,3), (0, 2)) #, rowspan=2)
ax4 = plt.subplot2grid((2,3), (1, 2))
#ax5 = plt.subplot2grid((4,4), (2, 1))
ax1.set_title(plotname_top, fontsize=16)
ax2.set_title(plotname_bottom, fontsize=16)
ax3.set_title(plotname_top, fontsize=16)
ax4.set_title(plotname_bottom, fontsize=16)
#ax5.set_title('ax5 title', fontsize=35)
# series_l.plot(ax=axes[0,0], linewidth=3)
# series_r.plot(ax=axes[0,1], linewidth=3)
# series_l.hist(ax=axes[1,0])
# series_r.hist(ax=axes[1,1])
series_top.plot(ax=ax1, linewidth=3)
series_bottom.plot(ax=ax2, linewidth=3)
series_top.hist(ax=ax3, bins=20)
series_bottom.hist(ax=ax4, bins=20)
fig.tight_layout()
In [27]:
#not using this at the moment
test_params = {'ISI': 500,
'filter_outliers_beyond_x_stdevs': 3,
#'min_percentISI_deviation_counted_as_failure': 40,
'stdev_calcs_exclude_n_from_left': 2,
'stdev_calcs_exclude_n_from_right': 2,
#'stimulus_style': 'tick',
#'stimulus_timing': 'iso',
'wait_beats_after_subj_start': 6,
'wait_beats_after_task_start': 9, }
#test_task = long_name['iso5t2'] #'Ticks_ISO_T2_5'
#test_pids = ['101', '102', '103', '104', '105', '107']
def before_after_plots(taskname, pid):
unfilt = db_taps[taskname].xs(pid)
filtered = filter_taps(unfilt, task_params=sms_params[taskname])
print("{0} taps ==> {1} taps".format(len(unfilt), len(filtered)))
#fig = sideplots(test_taps.dev_perc, filtered.dev_perc)
outliers_removed = filtered[filtered.is_outlier != True]
print('pre-filter:\t' +
'sd= {sd} \t mean= {mean} \t md= {md}'.format(sd=unfilt.dev_perc.std(),
mean=unfilt.dev_perc.mean(),
md=unfilt.dev_perc.median()))
print('post-filter:\t' +
'sd= {sd} \t mean= {mean} \t md= {md}'.format(sd=filtered.dev_perc.std(),
mean=filtered.dev_perc.mean(),
md=filtered.dev_perc.median()))
fig = sideplots(unfilt.dev_perc,
outliers_removed.dev_perc,
plotname_top = "%s, P. %s, pre-filter" % (short_name[taskname], pid),
plotname_bottom = "P. {}, post-filter".format(pid))
#plt.show()
return fig
def too_many_plots(**kwargs):
gen_tasks_pids = general_task_pid_iterator(**kwargs)
for t, pid in gen_tasks_pids:
#plt.figure()
fig = before_after_plots(t, pid)
#plt.show()
yield fig
next_plot = too_many_plots()
for i in range(2):
# plt.figure()
# fig = next_plot.next()
plt.show(next_plot.next())
#skip_to = ('T1_SMS_8', '115')
#start_later = too_many_plots(skip_to_task='T1_SMS_8', skip_to_pid='080')
#
#for i in range(2):
# plt.figure()
# fig = start_later.next()
# plt.show()
In [24]:
from matplotlib.backends.backend_pdf import PdfPages
pp = PdfPages('C:/db_pickles/multipage-big.pdf')
tpid_plots = too_many_plots()
#for i in tpid_plots:
plotgrid = next_plot.next()
pp.savefig(plotgrid)
plt.close() #prevents output from displaying to user
else:
pp.close()
In [40]:
import psutil
free_megs = psutil.virtual_memory()[1] / 1000000
free_megs
Out[40]:
In [ ]:
#separate files for tasks
from matplotlib.backends.backend_pdf import PdfPages
iterator = general_task_pid_iterator(concise_labels=False,
skip_to_task='Ticks_Phase_8')
prev_t = None
pp = PdfPages('C:/db_pickles/empty.pdf')
while True:
try:
t, pid = iterator.next()
except StopIteration:
pp.close()
break
if prev_t != t:
pp.close()
pp = PdfPages('C:/db_pickles/sideplots - 2014-09-23b2 - %s.pdf' % short_name[t])
print("{} megs free memory".format(psutil.virtual_memory()[1] / 1000000))
fig = before_after_plots(t, pid)
pp.savefig(fig)
plt.close()
prev_t = t
pp.close()
In [37]:
def task_side_plotter_pdf(task_name):
from matplotlib.backends.backend_pdf import PdfPages
t = task_name
file_out = 'C:/db_pickles/sideplots - 2014-09-26c1 - %s.pdf' % short_name[t]
with PdfPages(file_out) as pp:
for pid in task_pids[task_name]:
print("{} megs free memory".format(psutil.virtual_memory()[1] / 1000000))
fig = before_after_plots(t, pid)
pp.savefig(fig)
plt.close()
pp.close()
for t in sms_tasknames:
task_side_plotter_pdf(t)
In [38]:
for i in general_task_pid_iterator(concise_labels=True):
pass
In [260]:
# OUTLIER LABELING - removed for now
adjusted_devperc_mean = rem_worst.mean()
taps['outlier_metric'] = taps.dev_perc / adjusted_devperc_mean
devperc_limit = rem_worst.std() * rem_beyond_stds
taps['is_outlier'] = ( (taps.dev_perc - adjusted_devperc_mean > devperc_limit)
| (taps.dev_perc - adjusted_devperc_mean < -1 * devperc_limit) )
if print_results:
print('worst deviations (% of ISI):')
print(list(temp_devperc_ordered[:nworst_left]))
print(list(temp_devperc_ordered[-1 * nworst_right:]))
print('original mean: %s' % round(taps.dev_perc.mean(), 1))
print('adjusted mean: %s' % round(adjusted_devperc_mean), 1)
print('original stdv: %s' % round(taps.dev_perc.std(), 1))
print('adjusted stdv: %s' % round(rem_worst.std(), 1))
outliers = taps[taps.is_outlier]
if len(outliers) > 0:
print('outlier deviations (% of ISI):')
print(list(outliers.dev_perc.round(decimals=2)))
else:
print('No outliers.')
taps.set_index('beat', inplace=True)
return taps
In [3]:
#filtering would have to take place here, since we aren't assigning "outliers"
#in the earlier processing steps anymore
db_taps_filt = {t: df[df.is_outlier==False]
for (t, df) in db_taps.items()}
In [4]:
db_taps_filt['T1_SMS_5']
Out[4]:
In [195]:
task_frames.keys()
Out[195]:
In [196]:
# Goofing off
def list_summary(ls, head=5, tail=5):
lhead = list(ls[:head])
ltail = list(ls[-tail:])
return ' '.join(lhead + ['...'] + ltail)
def tabbed_dict(d):
maxlength = max([len(h) for h in d.keys()])
set_tabs = 1 + maxlength//8
outd = {}
for k, v in d.items():
add_tabs = set_tabs - len(k)//8
outk = k + '\t' * add_tabs
outd[outk] = v
return outd
for k, v in tabbed_dict(task_pids).items():
print(k + list_summary(v))
In [197]:
def tapxs(df):
return df.xs('tap', level='stamp')
def tapxsp(df, pid):
return (df.xs(pid, level='pid')
.xs('tap', level='stamp'))
def db_ptap(task, pid):
return (task_frames[task].xs(pid, level='pid')
.xs('tap', level='stamp'))
In [199]:
df = task_frames[t].xs('098') #.xs('tap', level='stamp')
df[-30:] #.sort().iloc[-30:]
Out[199]:
In [206]:
import re
def col_find(df, regex):
cols = list(enumerate(df.columns))
matches = [#'%d. %s' %
(i, c)
for (i, c) in cols
#if filt in c
if re.findall(regex, c)
]
#print('\n'.join(matches))
return matches
filt = r"(^J)(.*)(d$)"
cf = col_find(dfo, filt)
import itertools
list(itertools.combinations(cf, 2))
Out[206]:
In [213]:
#df_X = dfo.T.iloc[49] #'Jits_ISO_5_dev_perc_std'
#df_Y = dfo.T.iloc[55] #'Jits_ISO_8_dev_perc_std'
def inverse_scatter(dfo, ilocx, ilocy, *args, **kwargs):
inversed = lambda df: 1.0/df
df_temp = pd.concat([inversed(dfo.T.iloc[ilocx]),
inversed(dfo.T.iloc[ilocy])],
axis=1)
df_temp.plot(x=0,y=1, kind='scatter', **kwargs)
plt.show()
print('r = %f' % df_temp.corr().iloc[0,1])
def inverse_correl(dfo, ilocx, ilocy, **kwargs):
inversed = lambda df: 1.0/df
df_temp = pd.concat([inversed(dfo.T.iloc[ilocx]),
inversed(dfo.T.iloc[ilocy])],
axis=1)
print('r = %f' % df_temp.corr().iloc[0,1])
inverse_scatter(dfo, 73, 79, figsize=(5,5))